by Saravanan Natarajan
Real-world data rarely comes clean. Using Python and its libraries, you will gather data from a variety of sources and in a variety of formats, assess its quality and tidiness, then clean it. This is called data wrangling.
Wrangling prepares the data for wow-worthy analyses and visualizations.
import pandas as pd
import datetime as dt
import numpy as np
import requests
import sqlalchemy
import tweepy
import json
import re
import time
from PIL import Image
from io import BytesIO
def get_df_list(df):
    """Return the dataframe's column names as a plain list."""
    # Iterating a DataFrame yields its column labels.
    return [column for column in df]
def get_df_info(df):
    """Print a concise summary of *df* (columns, non-null counts, dtypes)."""
    # DataFrame.info() writes to stdout and returns None.
    return df.info()
def get_df_describe(df):
    """Return summary statistics (count, mean, std, quartiles, ...) for *df*."""
    return df.describe()
def get_df_sample(df, sample_value):
    """Return *sample_value* randomly chosen rows from *df*."""
    # n= is the first positional parameter of DataFrame.sample.
    return df.sample(n=sample_value)
def get_df_value_counts(df):
    """Return the frequency of each distinct value, most frequent first."""
    return df.value_counts()
def get_df_len(df):
    """Return the number of rows in *df*."""
    return len(df)
def get_df_head(df, count):
    """Return the first *count* rows of *df*."""
    return df.head(n=count)
# Gather source 1: the WeRateDogs Twitter archive, provided as a local CSV.
df_weratedogs_archive = pd.read_csv('twitter-archive-enhanced.csv')
# Gather source 2: download the neural-network image predictions file from
# Udacity's CDN and save it under its original filename.
udacity_url = 'https://d17h27t6h515a5.cloudfront.net/topher/2017/August/599fd2ad_image-predictions/image-predictions.tsv'
url_response = requests.get(udacity_url)
with open(udacity_url.split('/')[-1], mode='wb') as file:
    file.write(url_response.content)
# The predictions file is tab-separated.
df_nn_image_predictions = pd.read_csv('image-predictions.tsv', sep='\t')
# Twitter API credentials -- placeholders; replace with real keys before running.
access_token = "YOUR ACCESS TOKEN"
access_token_secret = "YOUR ACCESS SECRET"
consumer_key = "YOUR CONSUMER KEY"
consumer_secret = "YOUR CONSUMER SECRET"
auth = tweepy.OAuthHandler(consumer_key, consumer_secret)
auth.set_access_token(access_token, access_token_secret)
# JSONParser makes API calls return plain dicts instead of Status objects;
# wait_on_rate_limit sleeps automatically when the rate limit is reached.
# NOTE(review): wait_on_rate_limit_notify exists only in tweepy v3.x (it was
# removed in v4) -- confirm the pinned tweepy version.
api = tweepy.API(auth, parser=tweepy.parsers.JSONParser(),
                 wait_on_rate_limit=True,
                 wait_on_rate_limit_notify = True)
# Preview the archive.
get_df_head(df_weratedogs_archive, 2)
We are also interested in additional data, such as the date and time each tweet was created and the utility (client) used to post it.
# Gather source 3: query the Twitter API for every tweet in the archive and
# collect favorite/retweet counts plus creation time and posting client.
df_list = []
error_list = []
start = time.time()
for tweet_id in df_weratedogs_archive.tweet_id:
    try:
        # tweet_mode='extended' returns the full, untruncated tweet payload.
        page = api.get_status(tweet_id, tweet_mode = 'extended')
        favorites = page['favorite_count']
        retweets = page['retweet_count']
        date_time = page['created_at']
        source = page['source']
        df_list.append({'tweet_id': int(tweet_id),
                        'favorites': int(favorites),
                        'retweets': int(retweets),
                        'date_time': pd.to_datetime(date_time),
                        'utility_source' : str(source)})
    except Exception as e:
        # Deleted/protected tweets raise here; keep the id for a retry pass.
        print(str(tweet_id)+ " _ " + str(e))
        error_list.append(tweet_id)
end = time.time()
# Elapsed wall-clock time for the whole download (rate limiting makes this slow).
print(end - start)
The tweet IDs from the archive whose tweets may have been deleted (and therefore could not be queried) were captured in error_list.
# Tweet IDs that could not be downloaded (e.g. deleted tweets).
error_list
# FIX: corrected the 'lengh' typo in the user-facing output.
print("The length of the result", len(df_list))
print("The length of the errors", len(error_list))
From the above results:
# Retry pass: query the API again for every tweet id that failed above.
ee_list = []
for err_tweet_id in error_list:
    try:
        # BUGFIX: the original loop never called the API here -- it reused the
        # stale `page` and `tweet_id` left over from the first download loop,
        # so every "retry" appended a duplicate of the last successful tweet.
        page = api.get_status(err_tweet_id, tweet_mode = 'extended')
        favorites = page['favorite_count']
        retweets = page['retweet_count']
        date_time = page['created_at']
        source = page['source']
        df_list.append({'tweet_id': int(err_tweet_id),
                        'favorites': int(favorites),
                        'retweets': int(retweets),
                        'date_time': pd.to_datetime(date_time),
                        'utility_source' : str(source)})
    except Exception as exc:
        # Still failing (most likely a deleted tweet); keep the id for removal.
        print(str(err_tweet_id) + " _ " + str(exc))
        ee_list.append(err_tweet_id)
# FIX: corrected the 'lengh' typos in the user-facing output.
print("The length of the result after Querying the errors separately", len(df_list))
print("The length of the errors", len(ee_list))
During cleaning we can remove the error list
# Build a dataframe from the downloaded tweets, persist it as tweet_json.txt,
# then reload it (the CSV round-trip normalizes dtypes).
df_json_tweets = pd.DataFrame(df_list, columns = ['tweet_id', 'favorites', 'retweets',
                                                  'date_time', 'utility_source'])
df_json_tweets.to_csv('tweet_json.txt', encoding = 'utf-8', index=False)
df_json_tweets = pd.read_csv('tweet_json.txt', encoding = 'utf-8')
get_df_head(df_json_tweets, 5)
The file tweet_json.txt, successfully saved in our working directory, contains the results of querying the API; it was then loaded into the df_json_tweets data frame, which has 6 columns and 2356 entries.
Gathering is the first step in the data wrangling process. We could finish the high-level gathering process:
Obtaining data
# Assess the archive: column names, dtypes/null counts, numeric summary,
# and a random sample of rows.
get_df_list(df_weratedogs_archive)
get_df_info(df_weratedogs_archive)
get_df_describe(df_weratedogs_archive)
get_df_sample(df_weratedogs_archive, 7)
WeRateDogs is popular because of the rating system, we will view rating numerators and denominators
# Distribution of rating numerators, denominators, and dog names.
get_df_value_counts(df_weratedogs_archive.rating_numerator)
get_df_value_counts(df_weratedogs_archive.rating_denominator)
get_df_value_counts(df_weratedogs_archive.name)
Randomly assess the data
# Spot-check individual entries, including all tweets with a 0 numerator.
print(df_weratedogs_archive.text[800])
print(df_weratedogs_archive.name[220])
print(df_weratedogs_archive[df_weratedogs_archive.rating_numerator == 0])
# Assess the image-predictions table the same way.
get_df_list(df_nn_image_predictions)
get_df_info(df_nn_image_predictions)
get_df_describe(df_nn_image_predictions)
get_df_sample(df_nn_image_predictions, 3)
get_df_value_counts(df_nn_image_predictions.p1_conf)
# Look up rows sharing one image URL (a possibly duplicated image).
df_nn_image_predictions[df_nn_image_predictions['jpg_url'] ==
                        'https://pbs.twimg.com/media/CZhn-QAWwAASQan.jpg']
# Check for missing and duplicated tweet ids.
print(df_nn_image_predictions [df_nn_image_predictions.tweet_id.isnull()])
print(df_nn_image_predictions[df_nn_image_predictions.tweet_id.duplicated()])
Based on reference we further assess the prediction data
# Download and display the image for tweet 860924035999428608 to eyeball
# the classifier's output for it.
envelope_image_url = df_nn_image_predictions.jpg_url[df_nn_image_predictions.
                                                     tweet_id==860924035999428608].iloc[0]
request_image = requests.get(envelope_image_url)
image = Image.open(BytesIO(request_image.content))
image
At p1 it was predicted as envelope, then oscilloscope and finally paper_towel. Same we can try for box_turtle below
# Fetch and display the image for tweet 870063196459192321 the same way.
# NOTE(review): the variable name 'envelope_image_url' is reused from the
# previous lookup and no longer matches this tweet's content.
envelope_image_url = df_nn_image_predictions.jpg_url[df_nn_image_predictions.
                                                     tweet_id==870063196459192321].iloc[0]
request_image = requests.get(envelope_image_url)
image = Image.open(BytesIO(request_image.content))
image
Even Clifford was added in prediction
# Fetch and display the image for tweet 880935762899988482.
box_turtle_image_url = df_nn_image_predictions.jpg_url[df_nn_image_predictions.
                                                       tweet_id==880935762899988482].iloc[0]
request_image_box_turtle = requests.get(box_turtle_image_url)
image = Image.open(BytesIO(request_image_box_turtle.content))
image
Prediction involves picture and real dog
# Fetch and display the image for tweet 666051853826850816.
box_turtle_image_url = df_nn_image_predictions.jpg_url[df_nn_image_predictions.
                                                       tweet_id==666051853826850816].iloc[0]
request_image_box_turtle = requests.get(box_turtle_image_url)
image = Image.open(BytesIO(request_image_box_turtle.content))
image
It's a turtle
# Assess the API-derived tweets table.
get_df_list(df_json_tweets)
get_df_info(df_json_tweets)
get_df_sample(df_json_tweets, 5)
Identify duplicates in all three dataframes
# Collect the column names of all three dataframes to find columns they share.
# BUGFIX: the original concatenated df_weratedogs_archive's columns twice and
# never included df_json_tweets, so the duplicate check compared the archive
# against itself.
df_all_columns = pd.Series(list(df_weratedogs_archive) +
                           list(df_nn_image_predictions) +
                           list(df_json_tweets))
df_all_columns[df_all_columns.duplicated()]
Assess data quality and tidiness
Quality issues were identified in the WeRateDogs Archive table, the Image prediction table, and the Twitter JSON table, along with tidiness (structural) issues.
The WeRateDogs Archive, Image prediction, and Twitter JSON tables should be part of one dataframe, and the dog stage columns in the WeRateDogs Archive dataframe can share a common name.
df_clean_weratedogs_archive = df_weratedogs_archive.copy()
# Work on copies so the raw gathered dataframes stay untouched.
df_clean_nn_image_predictions = df_nn_image_predictions.copy()
df_clean_json_tweets = df_json_tweets.copy()
Change timestamp and retweeted_status_timestamp to datetimes
# Convert both timestamp columns from strings to pandas datetimes.
for ts_column in ('timestamp', 'retweeted_status_timestamp'):
    df_clean_weratedogs_archive[ts_column] = pd.to_datetime(
        df_clean_weratedogs_archive[ts_column])
get_df_info(df_clean_weratedogs_archive)
name columns can't possibly be dog names. (a, an ...)
# Replace bogus names ('a', 'an', ..., 'None') by searching the tweet text
# for the word following "named " or "name is ".
wrong_names = []  # kept for compatibility; not populated by this pass
# FIX: compile the pattern once and run findall() once per row -- the
# original recompiled the regex and re-ran findall inside every iteration.
name_pattern = re.compile(r'(?:named\s|name\sis\s)(\w+)')
for idx, name in enumerate(df_clean_weratedogs_archive.name):
    if name.islower() or name == 'None':
        # NOTE(review): idx is used as a .loc label -- this relies on the
        # dataframe keeping its default RangeIndex.
        matches = name_pattern.findall(df_clean_weratedogs_archive.loc[idx, 'text'])
        if matches:
            df_clean_weratedogs_archive.loc[idx, 'name'] = matches[0]
        else:
            df_clean_weratedogs_archive.loc[idx, 'name'] = 'None'
get_df_value_counts(df_clean_weratedogs_archive.name)
In df_weratedogs_archive, the name "a" appeared 55 times and "an" 7 times; after cleaning the data, "a" and "an" no longer appear.
missing expanded_urls
def add_urls(row):
if pd.notnull(row['expanded_urls']):
return row
else:
tweet_id = row['tweet_id']
row['expanded_urls'] = 'https://twitter.com/dog_rates/status/{}'.format(
tweet_id)
return row
# Apply the fix row-wise, then verify that no expanded_urls remain null.
df_clean_weratedogs_archive = df_clean_weratedogs_archive.apply(add_urls, axis=1)
df_clean_weratedogs_archive[df_clean_weratedogs_archive['expanded_urls'].isnull()]
In df_clean_weratedogs_archive there is no null expanded_urls
The numerator and denominator columns have invalid values. Replace ratings manually by looking at text from tweet
# Rows (1-based positions from manual assessment) whose numerator/denominator
# were parsed incorrectly from the text; convert to 0-based for iloc.
problem_entries_indexes = [x-1 for x in [1069, 1166, 2336]]
problem_entries = df_clean_weratedogs_archive.iloc[problem_entries_indexes]
# Print the raw tweet text so the correct ratings can be read off manually.
for txt in problem_entries.text:
    print(txt)
# Manually corrected ratings, taken from the tweet text printed above.
# NOTE(review): .at is label-based while these values are positions; this
# works only while the dataframe keeps its default RangeIndex -- confirm.
df_clean_weratedogs_archive.at[problem_entries_indexes[0],
                               'rating_numerator'] = 14
df_clean_weratedogs_archive.at[problem_entries_indexes[0],
                               'rating_denominator'] = 10
df_clean_weratedogs_archive.at[problem_entries_indexes[1],
                               'rating_numerator'] = 13
df_clean_weratedogs_archive.at[problem_entries_indexes[1],
                               'rating_denominator'] = 10
df_clean_weratedogs_archive.at[problem_entries_indexes[2],
                               'rating_numerator'] = 9
df_clean_weratedogs_archive.at[problem_entries_indexes[2],
                               'rating_denominator'] = 10
# Re-read the corrected rows and show each text with its fixed rating.
problem_entries = df_clean_weratedogs_archive.iloc[problem_entries_indexes]
values = zip(problem_entries.text, problem_entries.rating_numerator,
             problem_entries.rating_denominator)
for (txt, n, d) in values:
    print('{}/{} from: \n\t"{}"'.format(n,d,txt))
Make all strings in p1, p2, and p3 lowercase
# Normalize all three prediction-label columns to lowercase.
for pred_column in ('p1', 'p2', 'p3'):
    df_clean_nn_image_predictions[pred_column] = (
        df_clean_nn_image_predictions[pred_column].str.lower())
get_df_head(df_clean_nn_image_predictions, 2)
Cleaning our data is the third step in data wrangling. It is where we will fix the quality and tidiness issues that we identified in the assess step.
# Merge the three cleaned tables into one master dataframe keyed on tweet_id.
# Left joins keep every archive tweet even if it lacks predictions/API data.
df_weratedogs_master = pd.merge(df_clean_weratedogs_archive,
                                df_clean_nn_image_predictions,
                                how = 'left', on = ['tweet_id'] )
df_weratedogs_master = pd.merge(df_weratedogs_master,
                                df_clean_json_tweets,
                                how = 'left', on = ['tweet_id'])
# Persist the merged (pre-filtering) master table.
df_weratedogs_master.to_csv('weratedogs_master.csv', encoding = 'utf-8')
get_df_info(df_weratedogs_master)
Delete the retweets, delete duplicated tweet_id and delete tweets with no pictures
# Keep only original tweets: retweets have a non-null retweeted_status_id.
df_weratedogs_master = df_weratedogs_master[pd.isnull(
    df_weratedogs_master.retweeted_status_id)]
# Drop fully duplicated rows.
df_weratedogs_master = df_weratedogs_master.drop_duplicates()
# Keep only tweets that have an image prediction (jpg_url present).
df_weratedogs_master = df_weratedogs_master.dropna(subset = ['jpg_url'])
get_df_len(df_weratedogs_master)
Drop columns related to retweet and drop column date_time we imported from the API
# Drop the retweet-related columns and the redundant API date_time column.
# FIX: drop(label, 1) passes axis positionally, which was deprecated and is
# removed in pandas 2.0; use the explicit columns= keyword in a single call.
df_weratedogs_master = df_weratedogs_master.drop(
    columns=['retweeted_status_id', 'retweeted_status_user_id',
             'retweeted_status_timestamp', 'date_time'])
get_df_list(df_weratedogs_master)
def breed_conf(row):
    """Return (breed, confidence) for a predictions row, preferring the
    most confident dog prediction (p1, then p2, then p3); returns
    ('Unidentifiable', 0) when none of the three predictions is a dog."""
    for p_col, conf_col, dog_col in (('p1', 'p1_conf', 'p1_dog'),
                                     ('p2', 'p2_conf', 'p2_dog'),
                                     ('p3', 'p3_conf', 'p3_dog')):
        if row[dog_col]:
            return row[p_col], row[conf_col]
    return 'Unidentifiable', 0

# FIX: the original used apply() purely for its side effect of appending to
# module-level lists, which silently duplicates data if this cell is re-run;
# build the columns from returned (breed, confidence) pairs instead.
breed_conf_pairs = [breed_conf(row) for _, row in df_weratedogs_master.iterrows()]
df_weratedogs_master['dog_breed'] = [breed for breed, _ in breed_conf_pairs]
df_weratedogs_master['confidence'] = [conf for _, conf in breed_conf_pairs]
# The individual prediction columns are now redundant.
df_weratedogs_master.drop(['p1', 'p1_conf', 'p1_dog',
                           'p2', 'p2_conf', 'p2_dog',
                           'p3', 'p3_conf', 'p3_dog',],
                          axis=1, inplace=True)
get_df_info(df_weratedogs_master)
get_df_value_counts(df_weratedogs_master.source)
The result of the above lines says:
the source column has 3 distinct values; we will clean that column and make its values clearer.
Drop the following columns 'in_reply_to_status_id', 'in_reply_to_user_id', 'utility_source' clean the content of source column
# Drop reply-related columns and the redundant utility_source column.
# FIX: drop(labels, 1) passes axis positionally, which was deprecated and is
# removed in pandas 2.0; use the explicit columns= keyword.
df_weratedogs_master = df_weratedogs_master.drop(
    columns=['in_reply_to_status_id', 'in_reply_to_user_id',
             'utility_source'])
# Extract the human-readable client name from the source anchor-tag markup.
df_weratedogs_master['source'] = df_weratedogs_master['source'].apply(
    lambda x: re.findall(r'>(.*)<', x)[0])
get_df_head(df_weratedogs_master,2)
'doggo','floofer', 'pupper' and 'puppo' in WeRateDogs Archive dataframe can have a common name.
# Consolidate the four dog-stage columns into one dog_type column parsed
# from the tweet text.
# FIX: the original kept two identical lists (x and y) and mapped between
# them with list.index(); a single keyword list produces the same values.
dog_type = []
stage_keywords = ['pupper', 'puppo', 'doggo', 'floof']
for text in df_weratedogs_master['text']:
    lowered = text.lower()
    for stage in stage_keywords:
        if stage in lowered:
            dog_type.append(stage)
            break
    else:
        # for-else: no stage keyword found in this tweet.
        dog_type.append('None')
df_weratedogs_master['dog_type'] = dog_type
df_weratedogs_master.drop(['doggo', 'floofer', 'pupper', 'puppo'],
                          axis=1, inplace=True)
get_df_info(df_weratedogs_master)
Add dog_names column
def _extract_dog_name(text):
    """Parse a dog name out of a tweet's text.

    Checks the archive's common intro phrases in order ('This is X',
    'Meet X', 'Say hello to X', 'Here we have X'), then the word after
    'named'. Returns the capitalized candidate word stripped of trailing
    ',' / '.', or the placeholder string 'NaN' when nothing matches.
    """
    words = text.split()
    # (phrase, index of the candidate word in the split text), checked in
    # the same order as the original if/elif chain.
    intro_phrases = (('This is ', 2),
                     ('Meet ', 1),
                     ('Say hello to ', 3),
                     ('Here we have ', 3))
    for phrase, idx in intro_phrases:
        if text.startswith(phrase) and re.match(r'[A-Z].*', words[idx]):
            return words[idx].strip(',').strip('.')
    if 'named' in text:
        candidate = words[words.index('named') + 1]
        if re.match(r'[A-Z].*', candidate):
            return candidate.strip(',').strip('.')
    return 'NaN'

# Add a dog_name column parsed from each tweet's text.
# Refactor: the original repeated the same match/strip logic in five
# near-identical branches; a phrase table removes the duplication.
dog_names = [_extract_dog_name(text) for text in df_weratedogs_master['text']]
df_weratedogs_master['dog_name'] = dog_names
get_df_info(df_weratedogs_master)
Identify dog gender and add the column
# Infer dog gender from pronouns in the tweet text (male pronouns are
# checked first, so a tweet mentioning both is labelled male).
male = ['He', 'he', 'him', 'his', "he's", 'himself']
female = ['She', 'she', 'her', 'hers', 'herself', "she's"]
dog_gender = []
for text in df_weratedogs_master['text']:
    words = text.split()
    if any(word in male for word in words):
        dog_gender.append('male')
    elif any(word in female for word in words):
        dog_gender.append('female')
    else:
        # No pronoun found; placeholder string replaced with None later.
        dog_gender.append('NaN')
df_weratedogs_master['dog_gender'] = dog_gender
len(dog_gender)
print("dog_gender count \n", get_df_value_counts(
    df_weratedogs_master.dog_gender))
Replace NaN by None
# Replace the 'NaN' placeholder strings with real nulls (None) so pandas
# treats those entries as missing values.
df_weratedogs_master.loc[df_weratedogs_master
                         ['dog_name'] == 'NaN',
                         'dog_name'] = None
df_weratedogs_master.loc[df_weratedogs_master
                         ['dog_gender'] == 'NaN',
                         'dog_gender'] = None
Match the data type before storing
# Inspect current dtypes, convert each column to its intended type in a
# single astype() call, then inspect again.
df_weratedogs_master.dtypes
dtype_map = {
    'tweet_id': object,
    'source': 'category',
    'rating_numerator': float,
    'rating_denominator': float,
    'favorites': int,
    'retweets': int,
    'dog_type': 'category',
    'dog_gender': 'category',
    'dog_name': 'category',
}
df_weratedogs_master = df_weratedogs_master.astype(dtype_map)
df_weratedogs_master.dtypes
# Persist the cleaned master table to CSV, then reload it so the in-memory
# dtypes match what downstream consumers of the file will see.
df_weratedogs_master.to_csv('twitter_archive_master.csv',
                            index=False, encoding = 'utf-8')
df_weratedogs_master = pd.read_csv('twitter_archive_master.csv')
df_weratedogs_master.info()
# Also store the table in a local SQLite database.
# NOTE(review): to_sql has no if_exists= argument here, so re-running this
# cell raises ValueError once the 'master' table exists -- confirm intent.
engine = sqlalchemy.create_engine('sqlite:///twitter_archive_master.db')
df_weratedogs_master.to_sql('master', engine, index=False)
# Round-trip check: read the table back from SQLite and preview it.
df_gather = pd.read_sql('SELECT * FROM master', engine)
get_df_head(df_gather, 3)